When removing the gene ARHGAP11B (ENSG00000187951) in the Gandal dataset there was an important change in the WGCNA modules, and when analysing what could have caused this, we found that this gene had a very big outlier value for a single sample, which we think altered the values of the other genes during the vst transformation enough to alter the modules by the WGNCA algorithm.

Since the preprocessing pipeline did not catch this behaviour and it ended up causing problems in downstream analysis, we want to see how often we can find these type of behaviours in the data, if/how much they alter the results of WGCNA modules, and if it was just a behaviour specific to the Gandal dataset or if it can be found in others as well.

Note: This ARHGAP11B gene is not in this dataset

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(WGCNA)
library(expss)
library(knitr)
library(polycor)
# Load preprocessed data to get DE information and the genes that weren't filtered during preprocessing
load('./../../AllRegions/Data/filtered_raw_data.RData')
datExpr_filtered = datExpr
datGenes_filtered = datGenes
datMeta_filtered = datMeta

load('./../../AllRegions/Data/preprocessed_data.RData')
DE_info = DE_info %>% data.frame %>% mutate('ID' = rownames(datExpr), 'DE' = padj<0.05) %>%
          dplyr::select(ID, log2FoldChange, padj, DE)


# Load original expression datasets
datExpr = read.delim('./../Data/datExpr.csv', row.names=1)
rownames(datExpr) = substr(rownames(datExpr), 1, 15)

# Filtering genes: These filters would be the same, so I'll just keep the genes present in the 
# filtered dataset instead of repeating the whole filtering
datExpr = datExpr[rownames(datExpr) %in% rownames(datExpr_filtered),colnames(datExpr) %in% datMeta$ID]
datGenes = datGenes_filtered %>% filter(ensembl_gene_id %in% rownames(datExpr_filtered))

rm(datExpr_filtered, datGenes_filtered, dds)

Original z-score metric


\(metric_i = max_j \frac{|x_{i,j} - mean(x_i)|}{sd(x_i)}\)

z_scores = datExpr %>% apply(1, function(x) abs(x-mean(x))/sd(x)) %>% t %>% data.frame


Outlier genes


Genes with the highest z-score in any of its entries

max_z_scores = data.frame('ID' = rownames(z_scores), 'max_z_score' = z_scores %>% apply(1, max), 
                          'outlier_sample' = z_scores %>% apply(1, function(x) datMeta$ID[which.max(x)])) %>%
               left_join(DE_info, by = 'ID')
summary(max_z_scores$max_z_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.559   4.817   5.889   5.901   6.918  10.089
p = max_z_scores %>% ggplot(aes(ID, max_z_score+1, color = DE)) + geom_point(alpha=0.3) + 
                           xlab('Genes') + ylab('Max |Z-score|') + theme_minimal() + scale_y_log10() +
                           ggtitle('Max|z-score| value for each gene') +
                           theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
                                 legend.position='none', panel.grid.major = element_blank())

ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)

rm(p)

Top 20 genes

  • These genes have usually a signle outlier sample

  • Most of the outliers correspond to ASD samples but there are some from Control samples as well

  • This values don’t seem to be as extreme as the ones found in the Gandal dataset

  • Standardising the max_z_score value of each gene, all of the 20 top genes would be considered outliers (more than 2 sd from the average max_z_score value)

plot_function = function(i){
  i = 3*i-2
  plot_list = list()
  for(j in 1:3){
    plot_data = data.frame('sample' = colnames(datExpr), 'expr' = t(datExpr[top_genes$ID[i+j-1],]),
                           'Diagnosis' = datMeta$Diagnosis)
    colnames(plot_data)[2] = 'expr'
    plot_list[[j]] = ggplotly(plot_data %>% ggplot(aes(sample, expr, color=Diagnosis)) + 
                              geom_point() + theme_minimal() + 
                              theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank()))
  }
  p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
    list(x = 0.1 , y = 1.05, text =  top_genes$hgnc_symbol[i], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.5 , y = 1.05, text = top_genes$hgnc_symbol[i+1], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.9 , y = 1.05, text = top_genes$hgnc_symbol[i+2], showarrow = F, xref='paper', yref='paper')))
  
  return(p)
}

top_genes = max_z_scores %>% dplyr::top_n(n=21, w=max_z_score) %>% arrange(desc(max_z_score)) %>% 
            left_join(datGenes, by=c('ID' = 'ensembl_gene_id')) %>%
            mutate('StandMaxZscore' = (max_z_score-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)) %>%
            dplyr::select(ID, hgnc_symbol, log2FoldChange, padj, DE, max_z_score, StandMaxZscore)

kable(top_genes, caption='Top 20 genes with the highest max z-score value')
Top 20 genes with the highest max z-score value
ID hgnc_symbol log2FoldChange padj DE max_z_score StandMaxZscore
ENSG00000256618 MTRNR2L1 1.9916981 0.0401708 TRUE 10.089062 3.064010
ENSG00000173110 HSPA6 -1.0669636 0.2221616 FALSE 10.078125 3.056009
ENSG00000175592 FOSL1 0.5073645 0.6024915 FALSE 10.002990 3.001047
ENSG00000197893 NRAP 0.4561690 NA NA 9.940447 2.955295
ENSG00000161298 ZNF382 0.2422453 NA NA 9.932058 2.949158
ENSG00000006327 TNFRSF12A 0.0166290 0.9874780 FALSE 9.923768 2.943094
ENSG00000155975 VPS37A 0.3799827 0.1502730 FALSE 9.812672 2.861825
ENSG00000139629 GALNT6 0.1749755 0.5941066 FALSE 9.811609 2.861047
ENSG00000100362 PVALB -0.8270296 0.0335412 TRUE 9.735750 2.805555
ENSG00000142765 SYTL1 0.6009630 NA NA 9.712743 2.788725
ENSG00000121410 A1BG -0.0890489 NA NA 9.704083 2.782390
ENSG00000133800 LYVE1 -0.2529720 0.6515612 FALSE 9.665992 2.754526
ENSG00000174576 NPAS4 -0.5234097 0.5679929 FALSE 9.664777 2.753637
ENSG00000166246 C16orf71 0.0102824 NA NA 9.631361 2.729192
ENSG00000100122 CRYBB1 -0.0385638 0.9461326 FALSE 9.616935 2.718640
ENSG00000132518 GUCY2D -0.5383085 NA NA 9.607344 2.711623
ENSG00000149257 SERPINH1 0.3468448 0.5449874 FALSE 9.602366 2.707982
ENSG00000123119 NECAB1 0.0920081 0.7425828 FALSE 9.592995 2.701127
ENSG00000166278 C2 0.6753946 0.0365333 TRUE 9.590246 2.699116
ENSG00000243284 VSIG8 -0.2627459 NA NA 9.586191 2.696150
ENSG00000169313 P2RY12 -0.2472338 0.7498587 FALSE 9.573445 2.686826
plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)


Outlier samples


If there was no dependence between the samples and the outliers, then the maximum |z-scores| would be evenly distributed along the 106 samples, with ~130 maximum z-scores in each sample

  • Most of the outliers by gene are grouped in a single sample (s69 Occipital) and then on s7 Occipital


Defining outlier samples: Samples with a standardised Count > 2 (everything above the dotted line)

samples_info = max_z_scores$outlier_sample %>% table %>% data.frame %>% 
               dplyr::rename(Sample_ID = '.', Count = 'Freq') %>%
               left_join(datMeta, by = c('Sample_ID' = 'ID'))  %>% arrange(desc(Count)) %>% 
               mutate('StandardisedCount' = round((Count-mean(Count))/sd(Count),2)) %>%
               dplyr::select(Sample_ID, Count, StandardisedCount, Subject_ID, Gender, Age, brain_lobe, SiteHM, Diagnosis)

ggplotly(samples_info %>% ggplot(aes(Subject_ID, Count, fill=Diagnosis, color=brain_lobe)) + 
         geom_bar(stat = 'identity', size = 0, position = position_dodge2(preserve = 'single')) + xlab('Subjects') +
         geom_hline(yintercept = nrow(max_z_scores)/nrow(datMeta), color = 'gray') + 
         geom_hline(yintercept = mean(samples_info$Count)+2*sd(samples_info$Count), color = 'gray', linetype = 'dotted') + 
         ggtitle('Number of genes for which their maximum |z-score| value fell in each Sample') + 
         theme_minimal() + theme(axis.text.x = element_text(angle = 90, hjust = 1),
                                 legend.position = 'none'))
kable(samples_info %>% filter(Count>nrow(max_z_scores)/nrow(datMeta)), 
      caption = 'Samples with more max z-scores than expected by random assignment')
Samples with more max z-scores than expected by random assignment
Sample_ID Count StandardisedCount Subject_ID Gender Age brain_lobe SiteHM Diagnosis
ba19.s69 8264 9.91 s69 F 18 Occipital M CTL
ba19.s7 2099 2.40 s7 M 39 Occipital H ASD
ba10.s72 653 0.64 s72 M 13 Frontal M CTL
ba19.s6 523 0.48 s6 M 82 Occipital H ASD
ba10.s14 377 0.30 s14 M 2 Frontal H ASD
ba10.s11 262 0.16 s11 F 18 Frontal H ASD
ba10.s73 234 0.13 s73 M 16 Frontal M CTL
ba19.s2 225 0.12 s2 M 27 Occipital H ASD
ba10.s85 224 0.12 s85 M 17 Frontal M CTL


To see if the samples classified as outliers have a different general behaviour than the other samples (not just on the genes which had their maximum value in them), we calculate the z-score value each gene has on each sample and see their distribution

The outlier samples seem to have a higher distribution of z-scores along all of the genes when comparing them to random samples, not just the one which corresponded to the max|z-score|

plot_data = data.frame('ID' = rownames(datExpr))
outlier_samples = samples_info$Sample_ID[samples_info$Count>mean(samples_info$Count)+2*sd(samples_info$Count)]

# Calculate z-score of each gene for the outlier samples
for(s in outlier_samples){
  outlier_idx = which(datMeta$ID == s)
  z_scores = apply(datExpr, 1, function(x) (abs(x[outlier_idx]-mean(x)))/sd(x))
  plot_data = cbind(plot_data, z_scores)
}
colnames(plot_data)[-1] = as.character(outlier_samples)

# Select random samples for comparison
set.seed(123)
rand_samp_1 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_1 = which(datMeta$ID==rand_samp_1)
set.seed(124)
rand_samp_2 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_2 = which(datMeta$ID==rand_samp_2)
set.seed(125)
rand_samp_3 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_3 = which(datMeta$ID==rand_samp_3)
cat(paste0('Using random samples ', rand_samp_1, ', ', rand_samp_2, ', ', rand_samp_3, ' as a reference'))
## Using random samples ba19.s35, ba19.s75, ba19.s34 as a reference
z_func = function(x, rand_samp_idx) return(abs(x[rand_samp_idx]-mean(x))/sd(x))

# Transform data for plotting
levels = c(unique(as.character(samples_info$Sample_ID[samples_info$StandardisedCount>2])),
           'Random Sample 1','Random Sample 2','Random Sample 3')
plot_data_melt = plot_data %>% mutate('Random Sample 1'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_1)),
                                      'Random Sample 2'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_2)),
                                      'Random Sample 3'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_3))) %>%
                 melt()  %>% mutate(variable = factor(variable, levels=levels, ordered=T)) %>%
                 mutate(ID = case_when(variable == 'Random Sample 1' ~ rand_samp_1,
                                       variable == 'Random Sample 2' ~ rand_samp_2,
                                       variable == 'Random Sample 3' ~ rand_samp_3,
                                       TRUE ~ as.character(variable))) %>%
                 left_join(datMeta %>% dplyr::select(ID, Diagnosis), by = 'ID')

# Plot
ggplotly(plot_data_melt %>% ggplot(aes(variable, value+1, fill=Diagnosis)) + geom_boxplot() +
         xlab('Samples') +ylab('|z-scores|') + scale_y_log10() + theme_minimal() + 
         theme(axis.text.x = element_text(angle = 90, hjust = 1)))
rm(plot_data, plot_data_melt, s, outlier_idx, z_scores, rand_samp_1, rand_samp_idx_1,
   rand_samp_2, rand_samp_idx_2, rand_samp_3, rand_samp_idx_3, levels, z_func)


Metric currently used to filter samples vs z-score


Currently used metric:

  • Create weighted sample correlation network

  • Calculate the connectivity of each node

  • Normalise connectivity of the nodes (samples)

  • Filter nodes (samples) with a connectivity lower than -2 (low connectivity, with a distance larger than 2 sd to the mean connectivity of the network)


Notes:

  • The methods don’t agree in any samples
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

original_z_score = data.frame('ID' = gsub('X','',names(z.ku)), 'z_score_orig' = as.vector(z.ku)) %>%
                   left_join(datMeta, by = 'ID') %>% 
                   dplyr::select(ID, z_score_orig)

plot_data = samples_info %>% left_join(original_z_score, by = c('Sample_ID'='ID')) %>% 
            mutate('norm_count' = (Count-mean(Count))/sd(Count))

ggplotly(plot_data %>% ggplot(aes(z_score_orig, norm_count, color=Diagnosis)) + 
         geom_point(alpha=0.8, aes(id=Sample_ID)) + 
         geom_vline(xintercept = -2, color='gray', linetype = 'dotted') + 
         geom_hline(yintercept = 2, color='gray', linetype = 'dotted') + 
         xlab('Currently used metric') + ylab('Standardised number of max |z-score| genes in sample') + 
         theme_minimal())
rm(absadj, netsummary, ku, z.ku, original_z_score)

This time it looks like the original metric doesn’t agree as mucho with the PCA plot of the genes, but our method doesn’t do a very good job either

pca = datExpr %>% t %>% prcomp

pca_data = data.frame('ID' = gsub('X','',colnames(datExpr)), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
                 left_join(datMeta, by='ID') %>%
           dplyr::select(ID, PC1, PC2) %>% left_join(plot_data, by = c('ID'='Sample_ID'))

p1 = ggplotly(pca_data %>% ggplot(aes(PC1, PC2, color=-StandardisedCount)) + 
              geom_point(alpha=0.8, aes(id=ID)) +
              xlab(paste0('PC1 (', round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (', round(100*summary(pca)$importance[2,2],1),'%)')) +
              scale_color_viridis() + theme_minimal() + theme(legend.position = 'none') + 
              coord_fixed())

p2 = ggplotly(pca_data %>% ggplot(aes(PC1, PC2, color=z_score_orig)) +
              geom_point(alpha=0.8, aes(id=ID)) +
              xlab(paste0('PC1 (', round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (', round(100*summary(pca)$importance[2,2],1),'%)')) +
              ggtitle('Outlier genes in each sample (left) and Original sample outlier metric (right)') +
              scale_color_viridis() + 
              theme_minimal() + theme(legend.position = 'none') + coord_fixed())

subplot(p1, p2, nrows=1)
rm(pca, pca_data, p1, p2)







Robust z-score metric


\(metric_i = max_j \frac{|x_{i,j} - median(x_i)|}{mad(x_i)}\)

z_scores = datExpr %>% apply(1, function(x) abs(x-median(x))/mad(x)) %>% t %>% data.frame


Outlier genes


Genes with the highest z-score in any of their entries

max_z_scores = data.frame('ID' = rownames(z_scores), 'max_z_score' = z_scores %>% apply(1, max), 
                          'outlier_sample' = z_scores %>% apply(1, function(x) datMeta$ID[which.max(x)])) %>%
               left_join(DE_info, by = 'ID')

The dotted line indicates the value for the gene we removed ARHGAP11B (ENSG00000187951)

summary(max_z_scores$max_z_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.83    8.08   11.69   14.49   17.17 2179.45
p = max_z_scores %>% ggplot(aes(ID, max_z_score+1, color = DE)) + geom_point(alpha=0.3) + 
                           xlab('Genes') + ylab('Max |Z-score|') + theme_minimal() + scale_y_log10() +
                           ggtitle('Max|z-score| value for each gene') +
                           theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
                                 legend.position='none', panel.grid.major = element_blank())

ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)

rm(p)

Top 20 genes

  • These genes have more than one outlier sample

  • Most of the outliers correspond to ASD samples but it’s not as extreme as with the Gandal dataset

  • There no longer seem to be a lot of DE genes

  • The standardised max_z_score values of each of the top genes are huge! Probably because of the really long right tail of this metric’s distribution

plot_function = function(i){
  i = 3*i-2
  plot_list = list()
  for(j in 1:3){
    plot_data = data.frame('sample' = colnames(datExpr), 'expr' = t(datExpr[top_genes$ID[i+j-1],]),
                           'Diagnosis' = datMeta$Diagnosis)
    colnames(plot_data)[2] = 'expr'
    plot_list[[j]] = ggplotly(plot_data %>% ggplot(aes(sample, expr, color=Diagnosis)) + 
                              geom_point() + theme_minimal() + 
                              theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank()))
  }
  p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
    list(x = 0.1 , y = 1.05, text =  top_genes$hgnc_symbol[i], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.5 , y = 1.05, text = top_genes$hgnc_symbol[i+1], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.9 , y = 1.05, text = top_genes$hgnc_symbol[i+2], showarrow = F, xref='paper', yref='paper')))
  
  return(p)
}

top_genes = max_z_scores %>% dplyr::top_n(n=21, w=max_z_score) %>% arrange(desc(max_z_score)) %>% 
            left_join(datGenes, by=c('ID' = 'ensembl_gene_id')) %>%
            mutate('StandMaxZscore' = (max_z_score-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)) %>%
            dplyr::select(ID, hgnc_symbol, log2FoldChange, padj, DE, max_z_score, StandMaxZscore)

kable(top_genes, caption='Top 20 genes with the highest max z-score value')
Top 20 genes with the highest max z-score value
ID hgnc_symbol log2FoldChange padj DE max_z_score StandMaxZscore
ENSG00000173110 HSPA6 -1.0669636 0.2221616 FALSE 2179.4483 73.587340
ENSG00000256618 MTRNR2L1 1.9916981 0.0401708 TRUE 1675.1691 56.446812
ENSG00000196136 SERPINA3 1.4141957 0.0716810 FALSE 1632.2280 54.987237
ENSG00000174576 NPAS4 -0.5234097 0.5679929 FALSE 685.5044 22.807955
ENSG00000175592 FOSL1 0.5073645 0.6024915 FALSE 300.8229 9.732572
ENSG00000108691 CCL2 -0.7728324 0.3445928 FALSE 192.2299 6.041479
ENSG00000026508 CD44 0.6639284 0.2819296 FALSE 188.7825 5.924301
ENSG00000255823 MTRNR2L8 -1.0174587 0.2073937 FALSE 165.1880 5.122321
ENSG00000104951 IL4I1 0.6933161 0.3081869 FALSE 161.6248 5.001209
ENSG00000006327 TNFRSF12A 0.0166290 0.9874780 FALSE 147.7135 4.528360
ENSG00000198502 HLA-DRB5 1.0225666 0.2127074 FALSE 147.2638 4.513076
ENSG00000162645 GBP2 0.7160785 0.1557531 FALSE 130.0482 3.927916
ENSG00000133048 CHI3L1 0.9654080 0.0259421 TRUE 127.7419 3.849521
ENSG00000025708 TYMP 0.8388151 0.0179278 TRUE 119.3012 3.562622
ENSG00000155975 VPS37A 0.3799827 0.1502730 FALSE 118.1903 3.524862
ENSG00000139629 GALNT6 0.1749755 0.5941066 FALSE 115.9066 3.447239
ENSG00000169313 P2RY12 -0.2472338 0.7498587 FALSE 113.7828 3.375052
ENSG00000197249 SERPINA1 0.7627801 0.0788746 FALSE 112.9367 3.346293
ENSG00000100362 PVALB -0.8270296 0.0335412 TRUE 111.2947 3.290480
ENSG00000163682 RPL9 0.0460735 0.9473003 FALSE 109.2164 3.219838
ENSG00000143546 S100A8 0.6803657 0.4131652 FALSE 108.6543 3.200733
plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)


Outlier samples


The distribution of the genes with the max |z-score| is exactly the same as in the previous section because the two metrics are a monotonic transformation of the entries by row (by gene), so the maximum of each row is reached in the same sample in both metrics


To see if the samples classified as outliers have a different general behaviour than the other samples (not just on the genes which had their maximum value in them), we calculate the z-score value each gene has on each sample and see their distribution.

The outlier samples seem to have a higher distribution of z-scores along all of the genes when comparing them to random samples, not just the one which corresponded to the max|z-score|

The results are very similar to the ones from the first metric

plot_data = data.frame('ID' = rownames(datExpr))
outlier_samples = samples_info$Sample_ID[samples_info$Count>mean(samples_info$Count)+2*sd(samples_info$Count)]

# Calculate z-score of each gene for the outlier samples
for(s in outlier_samples){
  outlier_idx = which(datMeta$ID == s)
  z_scores = apply(datExpr, 1, function(x) (abs(x[outlier_idx]-mean(x)))/sd(x))
  plot_data = cbind(plot_data, z_scores)
}
colnames(plot_data)[-1] = as.character(outlier_samples)

# Select random samples for comparison
set.seed(123)
rand_samp_1 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_1 = which(datMeta$ID==rand_samp_1)
set.seed(124)
rand_samp_2 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_2 = which(datMeta$ID==rand_samp_2)
set.seed(125)
rand_samp_3 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_3 = which(datMeta$ID==rand_samp_3)
cat(paste0('Using random samples ', rand_samp_1, ', ', rand_samp_2, ', ', rand_samp_3, ' as a reference'))
## Using random samples ba19.s35, ba19.s75, ba19.s34 as a reference
z_func = function(x, rand_samp_idx) return(abs(x[rand_samp_idx]-mean(x))/sd(x))

# Transform data for plotting
levels = c(unique(as.character(samples_info$Sample_ID[samples_info$StandardisedCount>2])),
           'Random Sample 1','Random Sample 2','Random Sample 3')
plot_data_melt = plot_data %>% mutate('Random Sample 1'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_1)),
                                      'Random Sample 2'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_2)),
                                      'Random Sample 3'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_3))) %>%
                 melt()  %>% mutate(variable = factor(variable, levels=levels, ordered=T)) %>%
                 mutate(ID = case_when(variable == 'Random Sample 1' ~ rand_samp_1,
                                       variable == 'Random Sample 2' ~ rand_samp_2,
                                       variable == 'Random Sample 3' ~ rand_samp_3,
                                       TRUE ~ as.character(variable))) %>%
                 left_join(datMeta %>% dplyr::select(ID, Diagnosis), by = 'ID')

# Plot
ggplotly(plot_data_melt %>% ggplot(aes(variable, value+1, fill=Diagnosis)) + geom_boxplot() +
         xlab('Samples') +ylab('|z-scores|') + scale_y_log10() + theme_minimal() + 
         theme(axis.text.x = element_text(angle = 90, hjust = 1)))
rm(plot_data, plot_data_melt, s, outlier_idx, z_scores, rand_samp_1, rand_samp_idx_1,
   rand_samp_2, rand_samp_idx_2, rand_samp_3, rand_samp_idx_3, levels, z_func)


Metric currently used to filter samples vs z-score


The plot is exactly the same as with the first metric, since the maximums by row happen in the same columns, so the counts by sample are the same in both methods



Direct comparison between metrics

We know that when using the metrics for sample outlier detection they give exactly the same results, but they seem to vary greatly when using them for gene outlier detection

Using a distance of 3 standard deviations to define outliers in both methods:

plot_data = data.frame('ID' = rownames(datExpr),
                       'orig_z_score' = apply(datExpr, 1, function(x) max(abs(x-mean(x))/sd(x))),
                       'robust_z_score' = apply(datExpr, 1, function(x) max(abs(x-median(x))/mad(x)))) %>%
            mutate(outliers_orig_z_score = orig_z_score > mean(orig_z_score) + 3*sd(orig_z_score),
                   outliers_robust_z_score = robust_z_score > mean(robust_z_score) + 3*sd(robust_z_score),
                   alpha = ifelse(ID == 'ENSG00000187951', 1, 0.2)) %>%
            mutate(Outliers = case_when(outliers_orig_z_score & outliers_robust_z_score ~ 'Both',
                                        outliers_orig_z_score & !outliers_robust_z_score ~ 'Only Original',
                                        !outliers_orig_z_score & outliers_robust_z_score ~ 'Only Robust',
                                        TRUE ~ 'Neither')) %>%
            mutate(Outliers = factor(Outliers, levels = c('Neither','Only Original', 'Only Robust', 'Both')))

plot_data %>% ggplot(aes(orig_z_score, robust_z_score, color=Outliers)) + geom_point(alpha=plot_data$alpha) +
              geom_vline(xintercept = mean(plot_data$orig_z_score) + 3*sd(plot_data$orig_z_score),
                         color = 'gray', linetype = 'dashed') +
              geom_hline(yintercept = mean(plot_data$robust_z_score) + 3*sd(plot_data$robust_z_score),
                         color = 'gray', linetype = 'dashed') +
              geom_smooth(method = 'gam', se = FALSE, color='gray') + 
              ggtitle(paste0('R^2 = ', round(cor(plot_data$orig_z_score, plot_data$robust_z_score)[[1]]^2,2))) +
              scale_y_log10() + scale_color_viridis_d() + xlab('Original max|z-score|') +
              ylab('Robust max|z-score|') + theme_minimal()

table_info = plot_data %>% apply_labels(outliers_orig_z_score = 'Outliers using Original Metric',
                                        outliers_robust_z_score = 'Outliers using Robust Metric')

cro(table_info$outliers_orig_z_score, list(table_info$outliers_robust_z_score, total()))
 Outliers using Robust Metric     #Total 
 FALSE   TRUE   
 Outliers using Original Metric 
   FALSE  13676 19   13695
   TRUE  3   3
   #Total cases  13676 22   13698
rm(table_info)


Relation to DEA Results


Repeating the plot above but colouring genes depending on whether they are DE or not:

  • The robust metric seems to be giving higher values to the DE genes: this could be a problem, for example, for genes which have a strong disregulation in ASD, which could be labelled as outliers and filtered out of the dataset … or maybe it’s the other way around, and the DE algorithm is identifying genes as statistically significant that have many outlier values due to technical artifacts instead of biological significance…
plot_data = data.frame('ID' = rownames(datExpr),
                       'orig_z_score' = apply(datExpr, 1, function(x) max(abs(x-mean(x))/sd(x))),
                       'robust_z_score' = apply(datExpr, 1, function(x) max(abs(x-median(x))/mad(x))),
                       'meanExpr' = log2(rowMeans(datExpr))) %>%
            left_join(DE_info, by = 'ID') %>%
            mutate(outliers_orig_z_score = orig_z_score > mean(orig_z_score) + 3*sd(orig_z_score),
                   outliers_robust_z_score = robust_z_score > mean(robust_z_score) + 3*sd(robust_z_score),
                   alpha = ifelse(ID == 'ENSG00000187951', 1, 0.2))

plot_data %>% ggplot(aes(orig_z_score, robust_z_score, color=DE)) + geom_point(alpha=plot_data$alpha) +
              geom_vline(xintercept = mean(plot_data$orig_z_score) + 3*sd(plot_data$orig_z_score),
                         color = 'gray', linetype = 'dashed') +
              geom_hline(yintercept = mean(plot_data$robust_z_score) + 3*sd(plot_data$robust_z_score),
                         color = 'gray', linetype = 'dashed') +
              geom_smooth(method = 'gam', se = FALSE) + 
              ggtitle(paste0('R^2 = ', round(cor(plot_data$orig_z_score, plot_data$robust_z_score)[[1]]^2,2))) +
              scale_y_log10() + xlab('Original max|z-score|') + ylab('Robust max|z-score|') + theme_minimal()

rm(table_info)


If we order the genes by their max|z-score| and calculate the percentage of DE genes using a rolling average, we can see if there’s a change in this percentage of DE genes for different levels of the metric:

  • In general, as the scores increase, the percentage of DE genes decreases, which makes sense, since the score is measuring a certain kind of noise

  • There is a small increase in the genes with the highest scores in both methods, but it’s quite small

Note: The genes in this plot are ordered by their max|z-score| value, so there are actually two orderings of genes in the plot, one for the original metric and the other for the robust metric. That’s why there are different patterns in the % of DE genes for each metric

w = 2000
sliding_DE = data.frame('window' = 1:(nrow(datExpr)-w), 'original' = 0, 'robust' = 0)
original_sort = plot_data %>% arrange(orig_z_score)
robust_sort = plot_data %>% arrange(robust_z_score)

for(i in 1:nrow(sliding_DE)){
  sliding_DE$original[i] = mean(original_sort$DE[i:(i+w)], na.rm = TRUE)
  sliding_DE$robust[i] = mean(robust_sort$DE[i:(i+w)], na.rm = TRUE)
}

sliding_DE %>% melt(id.vars='window') %>% dplyr::rename(Metric = variable) %>% 
               ggplot(aes(window, 100*value, color=Metric)) + geom_line(alpha=0.3) + 
               geom_hline(yintercept = 100*mean(plot_data$DE, na.rm=TRUE), color = 'gray') + 
               geom_smooth(se=FALSE) + ylab('% DE genes in window') + 
               scale_color_manual(values = c('#e60073','#008080')) + theme_minimal()

rm(w, sliding_DE, original_sort, robust_sort, i)

If we plot now the rolling mean of the |LFC|:

  • For both metrics the LFC increases as the value of the metric increases (this makes sense because noisier samples would give bigger differences)

  • The |LFC| increases more in the robust version of the metric than in the original

w = 2000
sliding_DE = data.frame('window' = 1:(nrow(datExpr)-w), 'original' = 0, 'robust' = 0)
original_sort = plot_data %>% arrange(orig_z_score)
robust_sort = plot_data %>% arrange(robust_z_score)

for(i in 1:nrow(sliding_DE)){
  sliding_DE$original[i] = mean(abs(original_sort$log2FoldChange[i:(i+w)]), na.rm = TRUE)
  sliding_DE$robust[i] = mean(abs(robust_sort$log2FoldChange[i:(i+w)]), na.rm = TRUE)
}

sliding_DE %>% melt(id.vars='window') %>% dplyr::rename(Metric = variable) %>% 
               ggplot(aes(window, 100*value, color=Metric)) + geom_line(alpha=0.3) + 
               geom_hline(yintercept = 100*mean(abs(plot_data$log2FoldChange), na.rm=TRUE), color = 'gray') + 
               geom_smooth(se=FALSE) + ylab('Mean LFC of genes in window') + 
               scale_color_manual(values = c('#e60073','#008080')) + theme_minimal()

rm(w, sliding_DE, original_sort, robust_sort, i)

For the first half of the plot (~windows 0-6000): |LFC| remains roughly constant but %DE decreases steadily

  • Even though the change between Diagnoses remains the same, the increase in outlier samples (reflected in the position of the genes in a higher window) decreases the significance of the comparison, resulting in higher (and no longer significant) p-values

  • This means that the presence of outliers decreases the confidence of the DEA


For the second half of the plot (~windows 6000-1200): The %DE genes continues decreasing but in a less steep manner remains constant using the original metric but increases (maybe exponentially?) using the robust metric. At the same time, the |LFC| increases steadily on both metrics, but at a higher rate in the robust metric

  • For the Original metric we can give a similar conclusion as in the first half of the plot: The more extreme the outliers of a gene are, the more they damage the confidence of the DEA, resulting in genes with very high |LFC| that share the same level of confidence with genes with a not so extreme |LFC|

  • For the Robust metric, both the % of DE genes and the |LFC| increase, which sounds reasonable. The only problem with this is that we don’t know if this increase in DE genes is because of biological signals which are being mistaken for technical effects by the max|z-score|, or technical effects which are confused by biological signals by DESeq2. This problem is specific to this dataset, since consistently, the outlier values correspond to ASD samples


Relation to Level of Expression


If we plot the rolling mean of the mean level of expression of the genes:

  • The level of experssion plays an important role in both of the scores (same pattern found in Gandal)

  • Genes with lower mean expression tend to have larger max|z-score| values

w = 1000
sliding_DE = data.frame('window' = 1:(nrow(datExpr)-w), 'original' = 0, 'robust' = 0)
original_sort = plot_data %>% arrange(orig_z_score)
robust_sort = plot_data %>% arrange(robust_z_score)

for(i in 1:nrow(sliding_DE)){
  sliding_DE$original[i] = mean(original_sort$meanExpr[i:(i+w)], na.rm = TRUE)
  sliding_DE$robust[i] = mean(robust_sort$meanExpr[i:(i+w)], na.rm = TRUE)
}

sliding_DE %>% melt(id.vars='window') %>% dplyr::rename(Metric = variable) %>% 
               ggplot(aes(window, 100*value, color=Metric)) + geom_line(alpha=0.3) + 
               geom_hline(yintercept = 100*mean(plot_data$meanExpr, na.rm=TRUE), color = 'gray') + 
               geom_smooth(se=FALSE) + ylab('Mean Mean Level of Expression in window') + 
               scale_color_manual(values = c('#e60073','#008080')) + theme_minimal()

rm(w, sliding_DE, original_sort, robust_sort, i)

Conclusions